generate_sim_data <- function(n = 500,
ct_ratio = 4,
tx_es = 2,
u_es = 4,
x_count = 3,
x_u_count = x_count,
lg_contributors = 2,
md_contributors = 1,
lg_var_pct = 0.6,
md_var_pct = 0.3,
binary_outcome = FALSE,
time_to_event = FALSE,
censoring_rate = 0.2,
seed = 123,
eval_models = FALSE) {
set.seed(seed)
if (lg_contributors + md_contributors > x_u_count) {
stop("Error: Large and medium contributor counts exceed total x_u_count.")
}
sm_var_pct <- 1 - lg_var_pct - md_var_pct
if (round(lg_var_pct + md_var_pct + sm_var_pct, 6) != 1) {
stop("Error: Variance proportions must sum to 1.")
}
sm_contributors <- x_u_count - lg_contributors - md_contributors
X <- as.data.frame(matrix(rnorm(n * x_count, mean = 0, sd = 1), nrow = n))
colnames(X) <- paste0("X", 1:x_count)
big_weight <- ifelse(lg_contributors > 0, sqrt(lg_var_pct / lg_contributors), 0)
medium_weight <- ifelse(md_contributors > 0, sqrt(md_var_pct / md_contributors), 0)
small_weight <- ifelse(sm_contributors > 0, sqrt(sm_var_pct / sm_contributors), 0)
U <- numeric(n)
if (lg_contributors > 0) {
U <- U + rowSums(X[, 1:lg_contributors, drop = FALSE] * big_weight)
}
if (md_contributors > 0) {
U <- U + rowSums(X[, (lg_contributors + 1):(lg_contributors + md_contributors), drop = FALSE] * medium_weight)
}
if (sm_contributors > 0) {
U <- U + rowSums(X[, (lg_contributors + md_contributors + 1):x_u_count, drop = FALSE] * small_weight)
}
find_alpha <- function(alpha) mean(plogis(U + alpha)) - (1 / (1 + ct_ratio))
alpha_opt <- uniroot(find_alpha, interval = c(-5, 5))$root
prob_treated <- plogis(U + alpha_opt)
Tx <- rbinom(n, 1, prob_treated)
if (time_to_event) {
base_hazard <- 0.1
hazard <- base_hazard * exp(log(tx_es) * Tx + log(u_es) * U)
time <- -log(runif(n)) / hazard
censoring_time <- quantile(time, probs = 1 - censoring_rate)
event <- as.integer(time <= censoring_time)
time <- pmin(time, censoring_time)
df <- data.frame(id = 1:n, Tx = Tx, U = U, Y = time, event = event, X)
}
else if (binary_outcome) {
logit_Y <- log(tx_es) * Tx + log(u_es) * U
prob_Y <- plogis(logit_Y) # Convert logit to probability
Y <- rbinom(n, 1, prob_Y) # Generate binary outcome
df <- data.frame(id = 1:n, Tx = Tx, U = U, Y = Y, X)
} else {
Y <- tx_es * Tx + u_es * U + rnorm(n, mean = 0, sd = 1)
df <- data.frame(id = 1:n, Tx = Tx, U = U, Y = Y, X)
}
x_vars <- paste0("X", 1:x_count)
formula_y_tx <- as.formula("Y ~ Tx")
formula_y_tx_u <- as.formula("Y ~ Tx + U")
formula_y_tx_x <- as.formula(paste("Y ~ Tx +", paste(x_vars, collapse = " + ")))
formula_tx_x <- as.formula(paste("Tx ~", paste(x_vars, collapse = " + ")))
models <- list()
if (eval_models) {
if (time_to_event) {
models$model_y_tx <- coxph(Surv(time, event) ~ Tx, data = df) |> broom::tidy()
models$model_y_tx_u <- coxph(Surv(time, event) ~ Tx + U, data = df) |> broom::tidy()
models$model_y_tx_x <- coxph(Surv(time, event) ~ Tx + ., data = df) |> broom::tidy()
} else if (binary_outcome) {
models$model_y_tx <- glm(formula_y_tx, data = df, family = binomial()) |> broom::tidy()
models$model_y_tx_u <- glm(formula_y_tx_u, data = df, family = binomial()) |> broom::tidy()
models$model_y_tx_x <- glm(formula_y_tx_x, data = df, family = binomial()) |> broom::tidy()
} else {
models$model_y_tx <- lm(formula_y_tx, data = df) |> broom::tidy()
models$model_y_tx_u <- lm(formula_y_tx_u, data = df) |> broom::tidy()
models$model_y_tx_x <- lm(formula_y_tx_x, data = df) |> broom::tidy()
}
}
return(list(
df = df,
formulas = list(
formula_y_tx = formula_y_tx,
formula_y_tx_u = formula_y_tx_u,
formula_y_tx_x = formula_y_tx_x,
formula_tx_x = formula_tx_x
),
models = models,
params = list(
n = n,
ct_ratio = ct_ratio,
tx_es = tx_es,
u_es = u_es,
x_count = x_count,
x_u_count = x_u_count,
time_to_event = time_to_event,
binary_outcome = binary_outcome,
censoring_rate = censoring_rate,
seed = seed,
alpha_opt = alpha_opt
)
))
}